# Duomenys
# https://www.kaggle.com/datasets/brajeshmohapatra/bike-count-prediction-data-set?select=train.csv
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(AER)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
d <- read.csv("train.csv")
d <- dplyr::select(d, -c(datetime, casual, registered))
head(d)
## season holiday workingday weather temp atemp humidity windspeed count
## 1 1 0 0 1 9.84 14.395 81 0.0000 16
## 2 1 0 0 1 9.02 13.635 80 0.0000 40
## 3 1 0 0 1 9.02 13.635 80 0.0000 32
## 4 1 0 0 1 9.84 14.395 75 0.0000 13
## 5 1 0 0 1 9.84 14.395 75 0.0000 1
## 6 1 0 0 2 9.84 12.880 75 6.0032 1
summary(d)
## season holiday workingday weather
## Min. :1.000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000
## Median :2.000 Median :0.0000 Median :1.000 Median :1.000
## Mean :2.211 Mean :0.0275 Mean :0.686 Mean :1.427
## 3rd Qu.:3.000 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :4.000 Max. :1.0000 Max. :1.000 Max. :4.000
## temp atemp humidity windspeed
## Min. : 0.82 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.:13.12 1st Qu.:15.91 1st Qu.: 47.00 1st Qu.: 7.002
## Median :19.68 Median :23.48 Median : 62.00 Median :12.998
## Mean :19.73 Mean :23.11 Mean : 62.36 Mean :13.142
## 3rd Qu.:26.24 3rd Qu.:30.30 3rd Qu.: 79.00 3rd Qu.:19.001
## Max. :40.18 Max. :50.00 Max. :100.00 Max. :56.997
## count
## Min. : 1.0
## 1st Qu.: 35.0
## Median :124.0
## Mean :167.6
## 3rd Qu.:245.0
## Max. :957.0
ggplot(melt(d, "count"), aes(x = value, y = count, colour = variable)) +
geom_point() +
facet_wrap(~variable, scales = "free", nrow = 4)

### Poisson
m1 <- glm(count ~ ., family="poisson", data=d)
summary(m1)
##
## Call:
## glm(formula = count ~ ., family = "poisson", data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -27.345 -9.535 -2.690 4.545 41.151
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.782e+00 3.983e-03 1200.614 <2e-16 ***
## season 8.928e-03 7.960e-04 11.216 <2e-16 ***
## holiday -1.543e-01 4.662e-03 -33.092 <2e-16 ***
## workingday -1.327e-02 1.518e-03 -8.740 <2e-16 ***
## weather -1.260e-02 1.288e-03 -9.781 <2e-16 ***
## temp -1.618e-02 6.468e-04 -25.010 <2e-16 ***
## atemp 5.846e-02 5.958e-04 98.126 <2e-16 ***
## humidity -1.384e-02 4.098e-05 -337.640 <2e-16 ***
## windspeed 4.761e-03 8.596e-05 55.391 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1911146 on 12979 degrees of freedom
## Residual deviance: 1387988 on 12971 degrees of freedom
## AIC: 1469276
##
## Number of Fisher Scoring iterations: 5
cat("Deviacija padalinta is laisves laipsniu: ",m1$deviance / m1$df.residual)
## Deviacija padalinta is laisves laipsniu: 107.007
cat("Turi buti tarp 0.7 ir 1.3, tad nebegalime naudoti puasono modelio")
## Turi buti tarp 0.7 ir 1.3, tad nebegalime naudoti puasono modelio
dispersiontest(m1)
##
## Overdispersion test
##
## data: m1
## z = 51.787, p-value < 2.2e-16
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 114.8725
### Negative Binomial
m2 <- glm.nb(count ~ ., data = d)
summary(m2)
##
## Call:
## glm.nb(formula = count ~ ., data = d, init.theta = 1.030517395,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9313 -0.9845 -0.2273 0.3475 3.5581
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.5457388 0.0494949 91.843 < 2e-16 ***
## season 0.0179194 0.0092530 1.937 0.052793 .
## holiday -0.1578975 0.0549495 -2.874 0.004059 **
## workingday 0.0997106 0.0193834 5.144 2.69e-07 ***
## weather 0.0060803 0.0151710 0.401 0.688580
## temp -0.0303578 0.0092194 -3.293 0.000992 ***
## atemp 0.0769779 0.0084421 9.118 < 2e-16 ***
## humidity -0.0145218 0.0005224 -27.796 < 2e-16 ***
## windspeed 0.0043411 0.0011602 3.742 0.000183 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0305) family taken to be 1)
##
## Null deviance: 18362 on 12979 degrees of freedom
## Residual deviance: 14867 on 12971 degrees of freedom
## AIC: 155611
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.0305
## Std. Err.: 0.0116
##
## 2 x log-likelihood: -155591.0130
cat("Deviacija padalinta is laisves laipsniu: ",m2$deviance / m2$df.residual)
## Deviacija padalinta is laisves laipsniu: 1.146175
### Stepwise
m2step <- stepAIC(m2, direction = "both")
## Start: AIC=155609
## count ~ season + holiday + workingday + weather + temp + atemp +
## humidity + windspeed
##
## Df AIC
## - weather 1 155607
## <none> 155609
## - season 1 155611
## - holiday 1 155615
## - temp 1 155617
## - windspeed 1 155621
## - workingday 1 155633
## - atemp 1 155682
## - humidity 1 156325
##
## Step: AIC=155607.2
## count ~ season + holiday + workingday + temp + atemp + humidity +
## windspeed
##
## Df AIC
## <none> 155607
## + weather 1 155609
## - season 1 155609
## - holiday 1 155613
## - temp 1 155615
## - windspeed 1 155619
## - workingday 1 155631
## - atemp 1 155680
## - humidity 1 156506
summary(m2step)
##
## Call:
## glm.nb(formula = count ~ season + holiday + workingday + temp +
## atemp + humidity + windspeed, data = d, init.theta = 1.030507956,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9261 -0.9854 -0.2276 0.3477 3.5582
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.5486115 0.0489952 92.838 < 2e-16 ***
## season 0.0177488 0.0092342 1.922 0.054597 .
## holiday -0.1580013 0.0549497 -2.875 0.004035 **
## workingday 0.0998662 0.0193611 5.158 2.49e-07 ***
## temp -0.0302577 0.0092158 -3.283 0.001026 **
## atemp 0.0768636 0.0084370 9.110 < 2e-16 ***
## humidity -0.0144238 0.0004649 -31.026 < 2e-16 ***
## windspeed 0.0043896 0.0011492 3.820 0.000134 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0305) family taken to be 1)
##
## Null deviance: 18362 on 12979 degrees of freedom
## Residual deviance: 14867 on 12972 degrees of freedom
## AIC: 155609
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.0305
## Std. Err.: 0.0116
##
## 2 x log-likelihood: -155591.1600
cat("Deviacija padalinta is laisves laipsniu: ",m2step$deviance / m2step$df.residual)
## Deviacija padalinta is laisves laipsniu: 1.146088
# Anova between models
anova(m2, m2step)
## Likelihood ratio tests of Negative Binomial Models
##
## Response: count
## Model
## 1 season + holiday + workingday + temp + atemp + humidity + windspeed
## 2 season + holiday + workingday + weather + temp + atemp + humidity + windspeed
## theta Resid. df 2 x log-lik. Test df LR stat. Pr(Chi)
## 1 1.030508 12972 -155591.2
## 2 1.030517 12971 -155591.0 1 vs 2 1 0.1463433 0.7020546
# Koeficientai
est <- cbind(Estimate = coef(m2step), confint(m2step))
## Waiting for profiling to be done...
exp(est)
## Estimate 2.5 % 97.5 %
## (Intercept) 94.5011015 85.4811031 104.5054522
## season 1.0179073 1.0005701 1.0356400
## holiday 0.8538486 0.7677649 0.9525380
## workingday 1.1050231 1.0636883 1.1477461
## temp 0.9701955 0.9519689 0.9887905
## atemp 1.0798948 1.0612557 1.0988513
## humidity 0.9856797 0.9847646 0.9865953
## windspeed 1.0043992 1.0020931 1.0067166
library(ggplot2)
theme_set(theme_minimal())
### Prediction
dopred <- function(tt, model) {
tt$count <- tt$casual + tt$registered
index <- tt$index
real <- tt$count
tt <- dplyr::select(tt, -c(datetime, casual, registered, count))
predicted <- predict(m2step, newdata = tt, type = "response")
tempdf <- data.frame(index, real, predicted)
p<- ggplot(tempdf, aes(x=index)) +
geom_line(aes(y = real), color = "#0F9D58") +
geom_line(aes(y = predicted), color="#4285F4", linetype="twodash")
return(p)
}
test <- read.csv("test.csv")
test$index <- 1:nrow(test)
num_groups = 8
totest <- test %>%
group_by((row_number()-1) %/% (n()/num_groups)) %>%
nest %>% pull(data)
head(test)
## datetime season holiday workingday weather temp atemp humidity
## 1 2012-06-30 1:00:00 3 0 0 3 26.24 28.790 89
## 2 2012-06-30 2:00:00 3 0 0 2 26.24 28.790 89
## 3 2012-06-30 3:00:00 3 0 0 2 26.24 28.790 89
## 4 2012-06-30 4:00:00 3 0 0 2 25.42 27.275 94
## 5 2012-06-30 5:00:00 3 0 0 1 26.24 28.790 89
## 6 2012-06-30 6:00:00 3 0 0 1 26.24 28.790 89
## windspeed casual registered index
## 1 15.0013 3 55 1
## 2 0.0000 7 54 2
## 3 0.0000 3 20 3
## 4 0.0000 3 15 4
## 5 11.0014 3 7 5
## 6 11.0014 6 36 6
plots <- list() # new empty list
for (i in 1:8) {
p1 = dopred(data.frame(totest[i]))
plots[[i]] <- p1 # add each plot into plot list
}
plots
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]
